*-------------------------------------------------------------------------------*
* DIRECTORIES
*-------------------------------------------------------------------------------*
* Start with nothing in memory.
clear

* The do-file's first three positional arguments are stored, in order, as the
* data, code and figtab directory locals.
args l_direc_data l_direc_code l_direc_figtab

* Raw inputs and cleaned outputs are subfolders of the data directory.
local direc_data_clean `l_direc_data'/clean
local direc_data_raw `l_direc_data'/raw



*-------------------------------------------------------------------------------*
* AUTH AFFIL INFO
*-------------------------------------------------------------------------------*
* Import the 268 editor-affiliation n-gram CSV parts (suffixes 001-268), keep
* each one in its own tempfile, stack them, drop exact duplicates and save the
* result as authid_affil.dta.
quietly forvalues i = 1/268 {
	* file names carry a three-digit, zero-padded part number
	local filenum : display %03.0f `i'

	import delimited "`direc_data_raw'/editor_affil/editors_affil_ngram_part_`filenum'.csv", ///
		bindquote(strict) varnames(1) encoding(UTF-8) clear

	tempfile T`i'
	save `T`i'', replace
	noisily display "`i' in"
}

* Stack the parts in numeric order, starting from part 1.
use `T1', clear
forvalues i = 2/268 {
	append using `T`i''
	noisily display "`i' appended"
}
gduplicates drop

* Harmonize variable names with the rest of the pipeline.
rename (authid term term_count) (author_id affil_term affil_count)

compress
save "`direc_data_clean'/authid_affil.dta", replace // authorid terms




*-------------------------------------------------------------------------------*
* PREP AUTHOR-PMID
*-------------------------------------------------------------------------------*
* Stack every .dta file found in raw/authority_long into one author-pmid
* tempfile (T_authorid_pmid).
*
* Files are appended in chunks so the dataset in memory stays small while
* looping: the first chunk covers files 1-999, later chunks start at every
* multiple of 1000. Each completed chunk is parked in a tempfile and the
* chunks are stacked at the end.
*
* The chunk boundaries used to be hard-coded as 999, 1999, ..., 8999 and all
* nine tempfiles were appended unconditionally, so the section failed whenever
* the folder held fewer than 8999 files (undefined tempfile macro) and stopped
* chunking at 10000 or more. Boundaries are now computed with mod() and only
* the chunks that were actually saved are appended.
local l_filelist : dir "`direc_data_raw'/authority_long" files "*.dta"

* Fail early with a clear message rather than on a missing variable later.
local l_nfiles : word count `l_filelist'
if `l_nfiles' == 0 {
	di as error "no .dta files found in `direc_data_raw'/authority_long"
	exit 601
}

local l_chunks // counters (999, 1999, ...) of the chunks saved to tempfiles
local cnt = 1
qui foreach l_file in `l_filelist' {
	
	* first file of a chunk replaces memory; every other file is appended
	if `cnt'==1 | mod(`cnt',1000)==0 {
		use "`direc_data_raw'/authority_long/`l_file'", clear
	}
	else {
		append using "`direc_data_raw'/authority_long/`l_file'"
	}
	
	* last file of a chunk: park the chunk on disk and empty memory
	if mod(`cnt',1000)==999 {
		
		tempfile T`cnt'
		save `T`cnt'', replace
		local l_chunks `l_chunks' `cnt'
		clear
	}
	
	nois di "`cnt'"
	local cnt = `cnt'+1
}

* Memory now holds the trailing, incomplete chunk (possibly empty); add the
* completed chunks to it.
foreach l_chunk of local l_chunks {
	append using `T`l_chunk''
}

* pmid must be numeric; non-numeric values become missing and are dropped.
* capture keeps the script running if destring itself errors.
cap destring pmid , replace force
drop if pmid == .

gduplicates drop

tempfile T_authorid_pmid
save `T_authorid_pmid', replace



*-------------------------------------------------------------------------------*
* EDITOR INFO
*-------------------------------------------------------------------------------*
* Build, for every editor-journal pair, the affiliation-term shares of the
* editor's papers published before the editorship started, and save them as
* e_shares_eja.dta (one row per author_id x journal_nlmid x affil_term).
use "`direc_data_clean'/editor_info.dta", clear

keep author_id journal_nlmid editor_startyear
gduplicates drop

* Chain of many-to-many joins: editor -> pmids -> affiliation terms -> pmid info.
joinby author_id using `T_authorid_pmid'
joinby author_id using "`direc_data_clean'/authid_affil.dta"
joinby pmid using "`direc_data_clean'/pmid_info.dta"
drop journal pmid_cittot pmid_citpry
gduplicates drop

* Retain only papers dated strictly before the editor's start year.
drop if !(pmid_year < editor_startyear)
drop pmid_year editor_startyear

* Fractional count: a term's count over the total term count within each
* editor-journal-paper cell.
egen N_affil = sum(affil_count), by(author_id journal_nlmid pmid)
gen frcnt_all = affil_count / N_affil
drop N_affil

* Same fractional count weighted by the pmid_art / pmid_oth columns.
gen frcnt_art = frcnt_all * pmid_art
gen frcnt_oth = frcnt_all * pmid_oth

collapse (sum) frcnt_* , by(author_id journal_nlmid affil_term)

foreach l_var of varlist frcnt_* {
	* eshr1: share of the editor-journal total; undefined shares are set to 0
	egen denom = sum(`l_var') , by(author_id journal_nlmid)
	gen eshr1_`l_var' = `l_var'/denom
	replace eshr1_`l_var' = 0 if eshr1_`l_var' == .
	drop denom
	
	* eshr2: raw fractional count for now
	gen eshr2_`l_var' = `l_var' // will make shares later
}

keep author_id journal_nlmid affil_term eshr*

* Diagnostic only (data restored afterwards): per editor-journal totals.
preserve
	collapse (sum) eshr* , by(author_id journal_nlmid)
	foreach l_var of varlist eshr* {
		summarize `l_var'
		summarize `l_var' if `l_var' > 0 //TEST FOR SHARE = 1 for eshr1*
	}
restore

compress
save "`direc_data_clean'/e_shares_eja.dta", replace // not balanced



*-------------------------------------------------------------------------------*
* PREP ED-JOURNAL-YEAR
*-------------------------------------------------------------------------------*
* Build journal-year-affiliation editor shares (e_shares_ajt.dta):
*   1. make an editor x year grid for 1950-2008,
*   2. keep the years in which the editor served at each journal,
*   3. attach the editor's affiliation shares and aggregate to journal-year.
use "`direc_data_clean'/editor_info.dta", clear

keep author_id
gduplicates drop

* Editor x year grid. The previous construction (year = _n+1950 followed by
* fillin) started at 1951, so 1950 was never generated despite the 1950-2008
* filter, and it only covered all years when there were at least 58 editors.
* expand gives every editor exactly one row per year in the window.
local l_year_min = 1950
local l_year_max = 2008
local l_nyears = `l_year_max' - `l_year_min' + 1
expand `l_nyears'
bysort author_id: gen year = `l_year_min' - 1 + _n

joinby author_id using "`direc_data_clean'/editor_info.dta"

* Earliest start and latest end across an editor's records at a journal.
gcollapse (min) editor_startyear (max) editor_endyear , by(author_id journal_nlmid year)

keep if year >= editor_startyear & year <= editor_endyear


* MAKE EDITOR SHARES
joinby author_id journal_nlmid using "`direc_data_clean'/e_shares_eja.dta"

* Diagnostic only: totals per editor-journal-year.
preserve
	collapse (sum) eshr* , by(author_id journal_nlmid year)
	foreach l_var of varlist eshr* {
		su `l_var'
		su `l_var' if `l_var' > 0 //TEST FOR SHARE = 1 for eshr1*
	}
restore

* Rescale every measure by its journal-year total so that, after summing over
* editors below, terms add up to 1 within a journal-year.
foreach l_var of varlist eshr* {
	egen denom = sum(`l_var'), by(journal_nlmid year)
	replace `l_var' = `l_var' / denom
	drop denom
}

collapse (sum) eshr* , by(journal_nlmid year affil_term)

* Diagnostic only: totals per journal-year.
preserve
	collapse (sum) eshr* , by(journal_nlmid year)
	foreach l_var of varlist eshr* {
		su `l_var'
		su `l_var' if `l_var' > 0 //TEST FOR SHARE = 1 for eshr1* and for eshr2*
	}
restore

sort journal_nlmid year affil_term 
compress
save "`direc_data_clean'/e_shares_ajt.dta", replace // not balanced



*-------------------------------------------------------------------------------*
* PREP JOURNAL INFO
*-------------------------------------------------------------------------------*
* Load the journal list and reduce it to a cleaned journal-name -> NLM id
* crosswalk, stored in tempfile T_j.
import delimited "`direc_data_raw'/journal_list.csv", varnames(1) clear

rename nlmid journal_nlmid
* Rows whose `leading' note says so get their leading zero(s) put back.
replace journal_nlmid = "0" + journal_nlmid if regexm(leading,"should have 0")
replace journal_nlmid = "00" + journal_nlmid if regexm(leading,"should have two 0")

keep journal journal_nlmid

* Blank out ".", ",", ":" and the lower-case substring "the" wherever it occurs.
* NOTE(review): presumably mirrors the cleaning applied to journal in
* pmid_info.dta, which is joined on journal below - confirm.
foreach l_punc in "." "," ":" "the" {
	replace journal = subinstr(journal, "`l_punc'", " ", .)
}

* Squeeze whitespace (two passes, as before) and upper-case.
replace journal = itrim(trim(journal))
replace journal = upper(itrim(trim(journal)))

gduplicates drop

compress
tempfile T_j
save `T_j', replace //journal journal_nlmid | unique @ journal


* PMIDS BY JOURNAL
* Build journal-year-affiliation publication shares (p_shares_ajt.dta) from
* the papers dated 1950-2008 in the listed journals.
use "`direc_data_clean'/pmid_info.dta" if inrange(pmid_year, 1950, 2008), clear

* Keep only papers whose cleaned journal name is in the journal list.
joinby journal using `T_j' // only need affil for pmid of journals
drop journal
gduplicates drop

* Citation counts: blank when there is no journal id, otherwise missing -> 0.
foreach l_var of varlist pmid_cittot pmid_citpry {
	replace `l_var' = . if journal_nlmid == ""
	replace `l_var' = 0 if `l_var'==. & journal_nlmid != ""
}


* BRING IN AUTHORIDS
joinby pmid using `T_authorid_pmid'

* BRING IN AFFILS
joinby author_id using "`direc_data_clean'/authid_affil.dta"

*CALC SHARES
drop if pmid == .

* Fractional count: a term's count over the paper's total term count.
egen N_affil = sum(affil_count), by(pmid)
gen frcnt = affil_count / N_affil
drop N_affil

* Unweighted and citation-weighted versions; creation order is kept because
* the frcnt* wildcard below (which also picks up frcnt itself) follows it.
gen frcnt_all     = frcnt
gen frcntcit1_all = frcnt * pmid_cittot
gen frcntcit2_all = frcnt * pmid_citpry

gen frcnt_art     = frcnt * pmid_art
gen frcntcit1_art = frcnt * pmid_art * pmid_cittot
gen frcntcit2_art = frcnt * pmid_art * pmid_citpry

gen frcnt_oth     = frcnt * pmid_oth

rename pmid_year year
collapse (sum) frcnt* , by(affil_term journal_nlmid year)

* jshr_*: each measure as a share of its journal-year total (0 when undefined).
foreach l_var of varlist frcnt* {
	egen denom = sum(`l_var') , by(journal_nlmid year)
	gen jshr_`l_var' = `l_var'/denom
	replace jshr_`l_var' = 0 if jshr_`l_var' == .
	drop denom
}

* Diagnostic only: totals per journal-year.
preserve
	collapse (sum) jshr_* , by(journal_nlmid year)
	foreach l_var of varlist jshr_* {
		summarize `l_var'
		summarize `l_var' if `l_var' > 0 //TEST FOR SHARE = 1
	}
restore

compress
save "`direc_data_clean'/p_shares_ajt.dta", replace //unbalanced 



*-------------------------------------------------------------------------------*
* JOURNAL-YEAR-AFFIL FILLED IN
*-------------------------------------------------------------------------------*
* Build the balanced journal x affiliation-term x year skeleton (ajt_base.dta):
* take every combination observed in the publication shares, fill in the
* missing cells, trim years before each journal's cutoff, and attach one
* journal name per NLM id.
use "`direc_data_clean'/p_shares_ajt.dta", clear

keep journal_nlmid affil_term year

drop if affil_term == "" | year == .

fillin journal_nlmid affil_term year
drop _f

drop if year > 2008

* Journal-specific first years: rows before the cutoff are dropped.
* Each element is "<journal_nlmid> <first year kept>".
foreach l_pair in ///
	"9421642 1978" "9419065 1993" "9304532 1996" "8303128 1981" ///
	"8301365 1983" "7501160 1980" "2985191R 1978" "1310650 1976" ///
	"1300217 1975" "0410462 1983" "0372541 1981" "0372354 1972" ///
	"0372351 1969" "0255562 1976" "0217410 1975" "0147763 1981" ///
	"0047103 1980" "0401260 1964" {
	gettoken l_id l_first : l_pair
	drop if journal_nlmid == "`l_id'" & year < `l_first'
}
drop if journal_nlmid == "" & year < 1964

//based on pub data (later cutoffs than the ones listed above for these two ids)
foreach l_pair in "9419065 1998" "0410462 1988" {
	gettoken l_id l_first : l_pair
	drop if journal_nlmid == "`l_id'" & year < `l_first'
}

* One name per journal id: the shortest name in the journal list.
preserve

	import delimited "`direc_data_raw'/journal_list.csv", clear varnames(1)
	rename nlmid journal_nlmid
	replace journal_nlmid = "0" + journal_nlmid if regexm(leading,"should have 0")
	replace journal_nlmid = "00" + journal_nlmid if regexm(leading,"should have two 0")
	
	keep journal_nlmid journal
	rename journal journal_name
	gen len = strlen(journal_name)
	* journal_name breaks ties between equally short names; sorting on
	* (journal_nlmid len) alone leaves their order arbitrary, so the name
	* kept could change from run to run
	bysort journal_nlmid (len journal_name): keep if _n == 1
	keep journal_nlmid journal_name
	tempfile Tj
	save `Tj', replace
restore
joinby journal_nlmid using `Tj', unmatched(master) //keep out-of-sample
drop _merge

compress
save "`direc_data_clean'/ajt_base.dta" , replace



*-------------------------------------------------------------------------------*
* REG DATA BUILD AGG LOOP
*-------------------------------------------------------------------------------*
* Assemble the regression-ready panel: start from the journal x term x year
* skeleton, attach editor shares and publication shares, zero-fill unmatched
* cells and create fixed-effect identifiers.
use "`direc_data_clean'/ajt_base.dta", clear

* Left-join both share files; skeleton rows without a match are kept.
foreach l_shares in e_shares_ajt p_shares_ajt {
	joinby journal_nlmid affil_term year using "`direc_data_clean'/`l_shares'.dta", unmatched(master)
	drop _merge
}

*REMOVE NATURE, HEALTH AFFAIRS, J NEURO NEURO PSYCH SINCE BAD EDITOR DATA
drop if inlist(journal_nlmid, "8303128", "2985191R", "0410462")

* Cells with no editor / publication share count as zero.
foreach l_var of varlist es* js* {
	replace `l_var' = 0 if `l_var' == .
}

* Single fixed effects: journal (by name), year, affiliation term.
egen fe_j = group(journal_name)
gen fe_t = year
egen fe_a = group(affil_term)

* Pairwise fixed effects.
egen fe_ja = group(fe_j fe_a)
egen fe_jt = group(fe_j fe_t)
egen fe_at = group(fe_a fe_t)

compress

save "`direc_data_clean'/ajt_regready.dta", replace







